library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.4
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr) # Lirary for spliting train & test dataset
library(corrplot) # Plotting nice correlation matrix
## corrplot 0.84 loaded
library(caret)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.2
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(pROC) # For checking ROC Curve of the model
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(bestglm)
## Loading required package: leaps
library(brglm)
## Loading required package: profileModel
## 'brglm' will gradually be superseded by 'brglm2' (https://cran.r-project.org/package=brglm2), which provides utilities for mean and median bias reduction for all GLMs and methods for the detection of infinite estimates in binomial-response models.
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ResourceSelection)
## ResourceSelection 0.3-5   2019-07-22
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(gmodels)
## 
## Attaching package: 'gmodels'
## The following object is masked from 'package:pROC':
## 
##     ci
library(mice)
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(nnet)
library(VIM)
## Loading required package: colorspace
## 
## Attaching package: 'colorspace'
## The following object is masked from 'package:pROC':
## 
##     coords
## Loading required package: grid
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 3.0-2
setwd("/Users/haidu/Desktop/statistics/Categorical\ Data\ Analysis\ /project")
loan <- read.csv("loan.csv", header=T, na.strings=c("","NA"))
loan_data <- loan
attach(loan_data)
loan_data <- dplyr::select(loan_data, -Loan_ID) #Removing Loan_ID as it has no logical corelation
head(loan_data)
ABCDEFGHIJ0123456789
 
 
Gender
<fctr>
Married
<fctr>
Dependents
<fctr>
Education
<fctr>
Self_Employed
<fctr>
ApplicantIncome
<int>
CoapplicantIncome
<dbl>
1MaleNo0GraduateNo58490
2MaleYes1GraduateNo45831508
3MaleYes0GraduateYes30000
4MaleYes0Not GraduateNo25832358
5MaleNo0GraduateNo60000
6MaleYes2GraduateYes54174196
sum(is.na(loan_data))  ## Checking for total missing values
## [1] 149
colSums(is.na(loan_data)) ##  checking for any missing values in the feature
##            Gender           Married        Dependents         Education 
##                13                 3                15                 0 
##     Self_Employed   ApplicantIncome CoapplicantIncome        LoanAmount 
##                32                 0                 0                22 
##  Loan_Amount_Term    Credit_History     Property_Area       Loan_Status 
##                14                50                 0                 0
aggr(loan,prop=FALSE,numbers=TRUE)

Explore data visually

ggplot(data=loan_data) +
  geom_point(aes(x=LoanAmount, y=Loan_Status, color=Property_Area)) 
## Warning: Removed 22 rows containing missing values (geom_point).

ggplot(data=loan_data) +
  geom_bar(aes(x=Loan_Status,color=Loan_Status)) 

ggplot(data=loan_data) +
  geom_bar(aes(x=Loan_Amount_Term, fill=Loan_Status ))
## Warning: Removed 14 rows containing non-finite values (stat_count).
## Warning: position_stack requires non-overlapping x intervals

ggplot(data=loan_data) +
  geom_histogram(aes(x=LoanAmount), bins = 50)
## Warning: Removed 22 rows containing non-finite values (stat_bin).

ggplot(data=loan_data) +
  geom_histogram(aes(x=ApplicantIncome))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=loan_data) +
  geom_histogram(aes(x=CoapplicantIncome ))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=loan_data) +
  geom_bar(aes(x=Credit_History, fill=Loan_Status))
## Warning: Removed 50 rows containing non-finite values (stat_count).

ggplot(data=loan_data) +
  geom_bar(aes(x=Dependents,fill=Loan_Status ))

ggplot(data=loan_data) +
  geom_bar(aes(x=Education, fill=Loan_Status))

ggplot(data=loan_data) +
  geom_bar(aes(x=Married, fill=Loan_Status))

#Making Contingency Table to check percentage of Credit_History in relation with  loan status
CrossTable(loan_data$Loan_Status, loan_data$Credit_History,prop.r = TRUE, prop.c = FALSE, prop.t = FALSE,
           prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  564 
## 
##  
##                       | loan_data$Credit_History 
## loan_data$Loan_Status |         0 |         1 | Row Total | 
## ----------------------|-----------|-----------|-----------|
##                     N |        82 |        97 |       179 | 
##                       |     0.458 |     0.542 |     0.317 | 
## ----------------------|-----------|-----------|-----------|
##                     Y |         7 |       378 |       385 | 
##                       |     0.018 |     0.982 |     0.683 | 
## ----------------------|-----------|-----------|-----------|
##          Column Total |        89 |       475 |       564 | 
## ----------------------|-----------|-----------|-----------|
## 
## 
CrossTable(loan_data$Loan_Status, loan_data$Married,prop.r = TRUE, prop.c = FALSE, prop.t = FALSE,
           prop.chisq = FALSE )
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  611 
## 
##  
##                       | loan_data$Married 
## loan_data$Loan_Status |        No |       Yes | Row Total | 
## ----------------------|-----------|-----------|-----------|
##                     N |        79 |       113 |       192 | 
##                       |     0.411 |     0.589 |     0.314 | 
## ----------------------|-----------|-----------|-----------|
##                     Y |       134 |       285 |       419 | 
##                       |     0.320 |     0.680 |     0.686 | 
## ----------------------|-----------|-----------|-----------|
##          Column Total |       213 |       398 |       611 | 
## ----------------------|-----------|-----------|-----------|
## 
## 
CrossTable(loan_data$Loan_Status, loan_data$Education,prop.r = TRUE, prop.c = FALSE, prop.t = FALSE,
           prop.chisq = FALSE )
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  614 
## 
##  
##                       | loan_data$Education 
## loan_data$Loan_Status |     Graduate | Not Graduate |    Row Total | 
## ----------------------|--------------|--------------|--------------|
##                     N |          140 |           52 |          192 | 
##                       |        0.729 |        0.271 |        0.313 | 
## ----------------------|--------------|--------------|--------------|
##                     Y |          340 |           82 |          422 | 
##                       |        0.806 |        0.194 |        0.687 | 
## ----------------------|--------------|--------------|--------------|
##          Column Total |          480 |          134 |          614 | 
## ----------------------|--------------|--------------|--------------|
## 
## 
CrossTable(loan_data$Loan_Status, loan_data$Self_Employed,prop.r = TRUE, prop.c = FALSE, prop.t = FALSE,
           prop.chisq = FALSE )
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  582 
## 
##  
##                       | loan_data$Self_Employed 
## loan_data$Loan_Status |        No |       Yes | Row Total | 
## ----------------------|-----------|-----------|-----------|
##                     N |       157 |        26 |       183 | 
##                       |     0.858 |     0.142 |     0.314 | 
## ----------------------|-----------|-----------|-----------|
##                     Y |       343 |        56 |       399 | 
##                       |     0.860 |     0.140 |     0.686 | 
## ----------------------|-----------|-----------|-----------|
##          Column Total |       500 |        82 |       582 | 
## ----------------------|-----------|-----------|-----------|
## 
## 
CrossTable(loan_data$Loan_Status, loan_data$Property_Area,prop.r = TRUE, prop.c = FALSE, prop.t = FALSE,
           prop.chisq = FALSE )
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  614 
## 
##  
##                       | loan_data$Property_Area 
## loan_data$Loan_Status |     Rural | Semiurban |     Urban | Row Total | 
## ----------------------|-----------|-----------|-----------|-----------|
##                     N |        69 |        54 |        69 |       192 | 
##                       |     0.359 |     0.281 |     0.359 |     0.313 | 
## ----------------------|-----------|-----------|-----------|-----------|
##                     Y |       110 |       179 |       133 |       422 | 
##                       |     0.261 |     0.424 |     0.315 |     0.687 | 
## ----------------------|-----------|-----------|-----------|-----------|
##          Column Total |       179 |       233 |       202 |       614 | 
## ----------------------|-----------|-----------|-----------|-----------|
## 
## 

Handling Null, Missing and Categorical Variables

inputs the mean into the missing values, similary for categorical variable, we can use the category that apperars the most frequently

loan_data <- loan_data %>% 
  mutate(LoanAmount=ifelse(is.na(LoanAmount), mean(LoanAmount, na.rm = T), LoanAmount),
         Loan_Amount_Term=ifelse(is.na(Loan_Amount_Term), median(Loan_Amount_Term, na.rm = T), Loan_Amount_Term),
         Credit_History=ifelse(is.na(Credit_History), 1, Credit_History))

Handling Outlier

(Replace outlier with lower and upper cutoff value) using the rule of thumb where upper limit is computed as 1.5 * IRQ, where IRQ = 3rd Quartile – 1st Quartile.

#scatter plot  to detect outliers for ApplicantIncome
plot(ApplicantIncome, ylab = "ApplicantIncome")

outliers_upperlimit_AppIncome <- quantile(ApplicantIncome, 0.75) + 1.5 * IQR(ApplicantIncome) # upper_limit = 10171.25
index.outliers.ApplicantIncome <- which(ApplicantIncome > outliers_upperlimit_AppIncome | ApplicantIncome < 0 ) # 50 outliers
loan_data <- loan_data[-index.outliers.ApplicantIncome,] #Removing observations
plot(loan_data$ApplicantIncome, ylab = "ApplicantIncome") 

# detect outliers for CoapplicantIncome
plot(loan_data$CoapplicantIncome, ylab = "CoapplicantIncome")

outliers_upperlimit_CoIncome <- quantile(loan_data$CoapplicantIncome, 0.75) + 1.5 * IQR(loan_data$CoapplicantIncome) 
index.outliers.CoIncome <- which(loan_data$CoapplicantIncome > outliers_upperlimit_CoIncome | loan_data$CoapplicantIncome < 0 ) 
loan_data <- loan_data[-index.outliers.CoIncome,] #Removing observations
plot(loan_data$CoapplicantIncome, ylab = "CoapplicantIncome")  

# Treatment of outlier for LoanAmount
plot(loan_data$LoanAmount, ylab = "LoanAmount")

outliers_upperlimit_LoanAmount <- quantile(loan_data$LoanAmount, 0.75) + 1.5 * IQR(loan_data$LoanAmount) 
index.outliers.LoanAmount <- which(loan_data$LoanAmount > outliers_upperlimit_LoanAmount | loan_data$LoanAmount < 0 ) 
loan_data <- loan_data[-index.outliers.LoanAmount,] #Removing observations
plot(loan_data$LoanAmount, ylab = "LoanAmount")  

Transform the categorical data

create dummy variables for categorical attributes

#I converted the Dependents variable to a continuous variable in order
loan_data$Dependents=as.numeric(substr(loan_data$Dependents,1,1)) 

loan_data <- loan_data %>%
  mutate(Gender=ifelse(Gender=="Male",1,0),
         Married=ifelse(Married=="Yes",1,0),
         Education=ifelse(Education=="Graduate",1,0),
         Self_Employed=ifelse(Self_Employed=="Yes",1,0),
         Loan_Status=ifelse(Loan_Status=="Y",1,0))

#deal with missing value again, for catergorical data, use the category that appears most frequently
loan_data <- loan_data %>%
  mutate(Gender=ifelse(is.na(Gender),1,Gender),
         Married=ifelse(is.na(Married),1,Married),
         Dependents=ifelse(is.na(Dependents),0,Dependents),
         Self_Employed=ifelse(is.na(Self_Employed),0,Self_Employed))

# More than 2 unique values treatment
loan_data$Urban <- ifelse(loan_data$Property_Area=="Urban",1,0)
loan_data$Rural <- ifelse(loan_data$Property_Area=="Rural",1,0)
loan_data$Semiurban <-ifelse(loan_data$Property_Area=="Semiurban",1,0)

#check for correlation between the variables exploratory data analysis is to check correlations among all variables

cor(loan_data[, sapply(loan_data, class) != "factor" ],) #Checking multicollinearity 
##                         Gender      Married   Dependents    Education
## Gender             1.000000000  0.366896175  0.197637376 -0.077147385
## Married            0.366896175  1.000000000  0.347494857 -0.031658148
## Dependents         0.197637376  0.347494857  1.000000000 -0.091162626
## Education         -0.077147385 -0.031658148 -0.091162626  1.000000000
## Self_Employed      0.018360619  0.011510026  0.044635456 -0.002700106
## ApplicantIncome    0.058170211 -0.011708393  0.106643913  0.133652845
## CoapplicantIncome  0.198662627  0.257122438 -0.050769669  0.026163783
## LoanAmount         0.144713240  0.195775280  0.094648465  0.087518158
## Loan_Amount_Term  -0.081534683 -0.091292772 -0.097422737  0.070136775
## Credit_History     0.005464585  0.008026898 -0.030432194  0.083186603
## Loan_Status        0.051440973  0.083969054 -0.002540811  0.116144180
## Urban              0.045120925 -0.004504906  0.005676846  0.022884170
## Rural              0.074089533 -0.017468340 -0.036652822 -0.080294161
## Semiurban         -0.113276575  0.020836088  0.029235952  0.054044306
##                   Self_Employed ApplicantIncome CoapplicantIncome
## Gender              0.018360619     0.058170211       0.198662627
## Married             0.011510026    -0.011708393       0.257122438
## Dependents          0.044635456     0.106643913      -0.050769669
## Education          -0.002700106     0.133652845       0.026163783
## Self_Employed       1.000000000     0.179528002      -0.045701089
## ApplicantIncome     0.179528002     1.000000000      -0.270461490
## CoapplicantIncome  -0.045701089    -0.270461490       1.000000000
## LoanAmount          0.090048388     0.436923090       0.310754050
## Loan_Amount_Term   -0.066294038    -0.074934591      -0.010446208
## Credit_History      0.029057610     0.053453818       0.001293975
## Loan_Status        -0.013524023     0.019987144       0.065063190
## Urban              -0.073516055    -0.088117010      -0.053587233
## Rural               0.041375963     0.096143868       0.085539168
## Semiurban           0.031238872    -0.006589329      -0.029613571
##                     LoanAmount Loan_Amount_Term Credit_History
## Gender             0.144713240      -0.08153468    0.005464585
## Married            0.195775280      -0.09129277    0.008026898
## Dependents         0.094648465      -0.09742274   -0.030432194
## Education          0.087518158       0.07013678    0.083186603
## Self_Employed      0.090048388      -0.06629404    0.029057610
## ApplicantIncome    0.436923090      -0.07493459    0.053453818
## CoapplicantIncome  0.310754050      -0.01044621    0.001293975
## LoanAmount         1.000000000       0.07692013   -0.005444425
## Loan_Amount_Term   0.076920126       1.00000000   -0.018932317
## Credit_History    -0.005444425      -0.01893232    1.000000000
## Loan_Status       -0.021883151      -0.02571920    0.569831570
## Urban             -0.147243718      -0.10615095   -0.007679229
## Rural              0.109511523       0.03674422   -0.031672381
## Semiurban          0.037368052       0.06686150    0.037310664
##                    Loan_Status        Urban       Rural    Semiurban
## Gender             0.051440973  0.045120925  0.07408953 -0.113276575
## Married            0.083969054 -0.004504906 -0.01746834  0.020836088
## Dependents        -0.002540811  0.005676846 -0.03665282  0.029235952
## Education          0.116144180  0.022884170 -0.08029416  0.054044306
## Self_Employed     -0.013524023 -0.073516055  0.04137596  0.031238872
## ApplicantIncome    0.019987144 -0.088117010  0.09614387 -0.006589329
## CoapplicantIncome  0.065063190 -0.053587233  0.08553917 -0.029613571
## LoanAmount        -0.021883151 -0.147243718  0.10951152  0.037368052
## Loan_Amount_Term  -0.025719205 -0.106150948  0.03674422  0.066861503
## Credit_History     0.569831570 -0.007679229 -0.03167238  0.037310664
## Loan_Status        1.000000000 -0.042447024 -0.10757726  0.142393338
## Urban             -0.042447024  1.000000000 -0.44790480 -0.533621281
## Rural             -0.107577256 -0.447904799  1.00000000 -0.517134043
## Semiurban          0.142393338 -0.533621281 -0.51713404  1.000000000
# graph check multicollinearity
correlation <- loan_data[sapply(loan_data, is.numeric)]
descrCorr <- cor(correlation)
corrplot(descrCorr)

pairs.panels(loan_data)

Building Logistic Regression Model1 by using Multiple Imputation dataset

# let's take a look initial model
head(loan_data)
ABCDEFGHIJ0123456789
 
 
Gender
<dbl>
Married
<dbl>
Dependents
<dbl>
Education
<dbl>
Self_Employed
<dbl>
ApplicantIncome
<int>
CoapplicantIncome
<dbl>
LoanAmount
<dbl>
11001058490146.4122
21111045831508128.0000
3110113000066.0000
41100025832358120.0000
51001060000141.0000
6110002333151695.0000
loan_data_R <- dplyr::select(loan_data, -Property_Area)
reg1 <- glm(Loan_Status~., family = binomial, data = loan_data_R)
summary(reg1)
## 
## Call:
## glm(formula = Loan_Status ~ ., family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2938  -0.3139   0.5005   0.6809   2.6244  
## 
## Coefficients: (1 not defined because of singularities)
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.541e+00  9.550e-01  -2.660 0.007806 ** 
## Gender             3.093e-01  3.257e-01   0.950 0.342319    
## Married            4.423e-01  2.920e-01   1.515 0.129762    
## Dependents        -2.877e-02  1.366e-01  -0.211 0.833221    
## Education          4.837e-01  2.740e-01   1.766 0.077460 .  
## Self_Employed     -3.218e-01  3.669e-01  -0.877 0.380433    
## ApplicantIncome    1.019e-04  8.949e-05   1.139 0.254630    
## CoapplicantIncome  2.072e-04  1.152e-04   1.799 0.072011 .  
## LoanAmount        -7.525e-03  4.023e-03  -1.871 0.061383 .  
## Loan_Amount_Term  -4.842e-04  1.937e-03  -0.250 0.802634    
## Credit_History     4.215e+00  4.910e-01   8.585  < 2e-16 ***
## Urban             -8.709e-01  3.097e-01  -2.812 0.004920 ** 
## Rural             -1.012e+00  3.054e-01  -3.315 0.000917 ***
## Semiurban                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 449.00  on 509  degrees of freedom
## AIC: 475
## 
## Number of Fisher Scoring iterations: 5
cor(loan_data_R$Loan_Status, fitted(reg1))  #0.60
## [1] 0.6014094
hoslem.test(loan_data_R$Loan_Status, fitted(reg1))  #p-value = 0.4663
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  loan_data_R$Loan_Status, fitted(reg1)
## X-squared = 7.6703, df = 8, p-value = 0.4663
####################################################################################

#mice treating missing valued by using the loan data = origional data

loan <- dplyr::select(loan,-Loan_ID)

aggr(loan,prop=FALSE,numbers=TRUE)

aggr(loan,prop=TRUE,numbers=TRUE)

#Multiple Imputation
imps = mice(loan)
## 
##  iter imp variable
##   1   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
library(lattice)
densityplot(imps)

#What we would like to see is that the shape of the magenta points (imputed) matches the shape of the blue ones (observed). The matching shape tells us that the imputed values are indeed “plausible values”.The density of the imputed data for each imputed dataset is showed in magenta while the density of the observed data is showed in blue. Again, under our previous assumptions we expect the distributions to be similar.

#mice reg model
fit_mcie = with(imps, glm(loan$Loan_Status~., family = binomial, data = loan))
Final_micefit <- pool(fit_mcie)
summary(Final_micefit)
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.429153e+009.311516e-01-2.6087617462.9669.380985e-03
GenderMale3.253805e-013.309205e-010.9832590462.9663.259935e-01
MarriedYes5.738867e-012.924218e-011.9625307462.9665.029919e-02
Dependents1-3.756212e-013.460433e-01-1.0854745462.9662.782764e-01
Dependents22.770496e-013.781582e-010.7326289462.9664.641556e-01
Dependents3+1.883805e-014.874361e-010.3864722462.9666.993246e-01
EducationNot Graduate-4.209972e-013.032836e-01-1.3881303462.9661.657650e-01
Self_EmployedYes-1.491721e-013.523418e-01-0.4233733462.9666.722195e-01
ApplicantIncome6.945049e-062.862075e-050.2426578462.9668.083780e-01
CoapplicantIncome-5.142946e-054.307183e-05-1.1940393462.9662.330740e-01
# porpuslful selection
#step1
summary(pool(with(imps, glm(loan$Loan_Status~Gender, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)0.70657020.20089883.5170447596.94940.0004695274
GenderMale0.10879460.22355500.4866569596.94940.6266800834
summary(pool(with(imps, glm(loan$Loan_Status~Married, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)0.52839190.14184823.725052606.94820.0002134864
MarriedYes0.39670940.18021992.201252606.94820.0280938792
summary(pool(with(imps, glm(loan$Loan_Status~Dependents, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)0.79944180.11639366.8684360592.94991.643929e-11
Dependents1-0.19330600.2376484-0.8134120592.94994.163084e-01
Dependents20.31241570.25827361.2096306592.94992.269027e-01
Dependents3+-0.19330600.3152872-0.6131109592.94995.400381e-01
summary(pool(with(imps, glm(loan$Loan_Status~Education, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)0.88730320.10041928.835989609.94780.00000000
EducationNot Graduate-0.43182770.2037398-2.119506609.94780.03445109
summary(pool(with(imps, glm(loan$Loan_Status~Self_Employed, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)0.781484640.096358048.11021709577.95173.108624e-15
Self_EmployedYes-0.014229490.25613184-0.05555533577.95179.557152e-01
summary(pool(with(imps, glm(loan$Loan_Status~ApplicantIncome, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)7.964151e-011.158793e-016.8727999609.94781.558997e-11
ApplicantIncome-1.644623e-061.409642e-05-0.1166696609.94789.071603e-01
summary(pool(with(imps, glm(loan$Loan_Status~CoapplicantIncome, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)8.551532e-010.10002247578.549611609.94780.0000000
CoapplicantIncome-4.065626e-050.0000291521-1.394626609.94780.1636364
summary(pool(with(imps, glm(loan$Loan_Status~LoanAmount, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)0.95511429700.1748324485.4630265587.95056.919104e-08
LoanAmount-0.00091453550.001010431-0.9050949587.95053.657858e-01
summary(pool(with(imps, glm(loan$Loan_Status~Loan_Amount_Term, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)1.04772812610.4846326542.1619016595.94950.03102309
Loan_Amount_Term-0.00072274290.001388447-0.5205404595.94950.60288028
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.4608090.3937666-6.249410559.95388.167031e-10
Credit_History3.8209920.40988659.322075559.95380.000000e+00
summary(pool(with(imps, glm(loan$Loan_Status~Property_Area, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)0.46637390.15356973.0368866608.94790.0024925769
Property_AreaSemiurban0.73202790.21837753.3521213608.94790.0008516376
Property_AreaUrban0.18986880.21353030.8891887608.94790.3742528040
#we find that Married, Education, CoapplicantIncome, and Credit_History have p-values less than 0.2. 

summary(pool(with(imps, glm(loan$Loan_Status~Married+Education+CoapplicantIncome
                            +Credit_History, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.652940e+004.338735e-01-6.114547553.95461.829263e-09
MarriedYes5.437148e-012.260484e-012.405302553.95461.648597e-02
EducationNot Graduate-4.025707e-012.600842e-01-1.547848553.95461.222300e-01
CoapplicantIncome-4.696803e-053.947181e-05-1.189913553.95462.345902e-01
Credit_History3.835977e+004.125966e-019.297161553.95460.000000e+00
#step2
summary(pool(with(imps, glm(loan$Loan_Status~Married+Education+Credit_History, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.71344930.4317363-6.284969554.95446.638152e-10
MarriedYes0.51797870.22464892.305725554.95442.149379e-02
EducationNot Graduate-0.37845720.2587755-1.462493554.95441.441723e-01
Credit_History3.83239380.41248529.290985554.95440.000000e+00
summary(pool(with(imps, glm(loan$Loan_Status~Married+Credit_History, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.81941960.4276450-6.592897555.95431.004119e-10
MarriedYes0.52045950.22435752.319777555.95432.071421e-02
Credit_History3.85356140.41237199.344870555.95430.000000e+00
#step3 
summary(pool(with(imps, glm(loan$Loan_Status~Married+Credit_History       
                            +Dependents, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.78125200.4341829-6.4057156540.95623.255318e-10
MarriedYes0.49617880.24577522.0188316540.95624.399756e-02
Credit_History3.80506330.41573409.1526380540.95620.000000e+00
Dependents1-0.15761650.3113703-0.5062029540.95626.129204e-01
Dependents20.22116810.34465440.6417098540.95625.213335e-01
Dependents3+0.17047950.44175880.3859108540.95626.997145e-01
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +Education, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.71344930.4317363-6.284969554.95446.638152e-10
Credit_History3.83239380.41248529.290985554.95440.000000e+00
MarriedYes0.51797870.22464892.305725554.95442.149379e-02
EducationNot Graduate-0.37845720.2587755-1.462493554.95441.441723e-01
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +Self_Employed, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.800681300.4305185-6.5053683528.95761.798754e-10
Credit_History3.808467960.41313999.2183489528.95760.000000e+00
MarriedYes0.531050290.22901412.3188540528.95762.078313e-02
Self_EmployedYes-0.091295370.3184625-0.2866754528.95767.744731e-01
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +ApplicantIncome, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.816835e+004.389002e-01-6.41794023554.95442.965130e-10
Credit_History3.853415e+004.124028e-019.34381434554.95440.000000e+00
MarriedYes5.207964e-012.247314e-012.31741693554.95442.084395e-02
ApplicantIncome-4.892081e-071.877092e-05-0.02606203554.95449.792172e-01
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +CoapplicantIncome, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.769323e+000.4295733695-6.446682554.95442.486571e-10
Credit_History3.856305e+000.41240107179.350862554.95440.000000e+00
MarriedYes5.435283e-010.22569286312.408265554.95441.635357e-02
CoapplicantIncome-4.221221e-050.0000391437-1.078391554.95442.813279e-01
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +LoanAmount, family = binomial, data = loan))))       # only Loan_amount change Marries by at 10%
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.5932092180.456561772-5.679865534.95692.214888e-08
Credit_History3.8512879690.4150028699.280148534.95690.000000e+00
MarriedYes0.5976906460.2344350212.549494534.95691.106581e-02
LoanAmount-0.0015784250.001302304-1.212026534.95692.260376e-01
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +Loan_Amount_Term, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.53812652950.750962937-3.3798293540.95620.0007776295
Credit_History3.75720390120.4138352099.0789856540.95620.0000000000
MarriedYes0.51790379870.2260657152.2909436540.95620.0223503389
Loan_Amount_Term-0.00059796450.001780349-0.3358692540.95620.7370997362
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +Credit_History, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.81941960.4276450-6.592897555.95431.004119e-10
Credit_History3.85356140.41237199.344870555.95430.000000e+00
MarriedYes0.52045950.22435752.319777555.95432.071421e-02
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +Property_Area, family = binomial, data = loan))))
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-3.31066420.4700045-7.0438985553.95465.567324e-12
Credit_History3.94124020.42014519.3806634553.95460.000000e+00
MarriedYes0.52350120.22759792.3001146553.95462.181253e-02
Property_AreaSemiurban1.01702210.28261723.5985854553.95463.486179e-04
Property_AreaUrban0.19837750.26428200.7506281553.95464.531952e-01
#step4:Attempt adding plausible interactions among variables in the model, usually using somewhat stricter standards such a p-value<0.05 (can consider non-linear predictor terms, like quadratic effects, in this step as well).

summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +LoanAmount + Credit_History:Married, family = binomial, data = loan))))   
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.0388892640.628114204-3.246049533.9571.243569e-03
Credit_History3.2602443030.6332661355.148300533.9573.701468e-07
MarriedYes-0.2416602590.800493627-0.301889533.9577.628542e-01
LoanAmount-0.0015915220.001312593-1.212502533.9572.258563e-01
Credit_History:MarriedYes0.9078925690.8344533621.088009533.9572.770821e-01
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +LoanAmount + Married:LoanAmount, family = binomial, data = loan))))    
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-2.4297482160.535116244-4.5405989533.9576.938592e-06
Credit_History3.8509700960.4146926249.2863241533.9570.000000e+00
MarriedYes0.3588635960.4703077860.7630399533.9574.457766e-01
LoanAmount-0.0028325260.002490300-1.1374237533.9572.558714e-01
MarriedYes:LoanAmount0.0017242080.0029334970.5877654533.9575.569382e-01
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +LoanAmount + Credit_History:LoanAmount,
                            family = binomial, data = loan))))     # Credit_History:LoanAmount p = 0.034
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-3.6611020270.734296000-4.985867533.9578.352020e-07
Credit_History5.0322703840.7556412366.659603533.9576.837664e-11
MarriedYes0.6033717020.2366317882.549834533.9571.105569e-02
LoanAmount0.0049296710.0031812451.549605533.9571.218289e-01
Credit_History:LoanAmount-0.0072825600.003425296-2.126111533.9573.395215e-02
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married     
                            +LoanAmount+Credit_History:LoanAmount+I(LoanAmount^2), family = binomial, data = loan))))     # Credit_History:LoanAmount p = 0.046
ABCDEFGHIJ0123456789
term
<fctr>
estimate
<dbl>
std.error
<dbl>
statistic
<dbl>
df
<dbl>
p.value
<dbl>
(Intercept)-3.464251e+008.361211e-01-4.1432409532.95713.981945e-05
Credit_History4.984143e+007.583655e-016.5722168532.95711.182234e-10
MarriedYes6.132121e-012.375298e-012.5816220532.95711.009963e-02
LoanAmount2.972242e-035.181382e-030.5736388532.95715.664542e-01
I(LoanAmount^2)3.132992e-066.551381e-060.4782185532.95716.326908e-01
Credit_History:LoanAmount-6.989972e-033.490626e-03-2.0024982532.95714.573727e-02
# Conclusion:model included: Credit_History,  Married, LoanAmount,  Credit_History*LoanAmount


########################################################################
#complete the missing value by using final model predictor see what's going on
one_of_dataset <- complete(imps,1)
model1 <-glm(Loan_Status~Credit_History+Married+LoanAmount+
                        Credit_History:LoanAmount, family = binomial, data = one_of_dataset)

rocplotone11 <- roc(one_of_dataset$Loan_Status ~ fitted(model1), data = one_of_dataset)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot.roc(rocplotone11, print.auc = TRUE)

auc(rocplotone11)      #0.78 Auc depdent on different dataset
## Area under the curve: 0.7704
#using different complete dataset and stepAIC() see what's going on, 
stepAIC(glm(Loan_Status~., family = binomial, data = one_of_dataset))
## Start:  AIC=575.12
## Loan_Status ~ Gender + Married + Dependents + Education + Self_Employed + 
##     ApplicantIncome + CoapplicantIncome + LoanAmount + Loan_Amount_Term + 
##     Credit_History + Property_Area
## 
##                     Df Deviance    AIC
## - Dependents         3   547.68 571.68
## - Self_Employed      1   545.12 573.12
## - Gender             1   545.13 573.13
## - Loan_Amount_Term   1   545.38 573.38
## - ApplicantIncome    1   545.48 573.48
## - CoapplicantIncome  1   545.87 573.87
## - LoanAmount         1   547.00 575.00
## <none>                   545.12 575.12
## - Education          1   547.69 575.69
## - Married            1   551.09 579.09
## - Property_Area      2   558.46 584.46
## - Credit_History     1   733.48 761.48
## 
## Step:  AIC=571.68
## Loan_Status ~ Gender + Married + Education + Self_Employed + 
##     ApplicantIncome + CoapplicantIncome + LoanAmount + Loan_Amount_Term + 
##     Credit_History + Property_Area
## 
##                     Df Deviance    AIC
## - Self_Employed      1   547.68 569.68
## - Gender             1   547.72 569.72
## - Loan_Amount_Term   1   547.83 569.83
## - ApplicantIncome    1   548.06 570.06
## - CoapplicantIncome  1   548.28 570.28
## <none>                   547.68 571.68
## - LoanAmount         1   549.70 571.70
## - Education          1   550.24 572.24
## - Married            1   554.09 576.09
## - Property_Area      2   560.50 580.50
## - Credit_History     1   736.94 758.94
## 
## Step:  AIC=569.68
## Loan_Status ~ Gender + Married + Education + ApplicantIncome + 
##     CoapplicantIncome + LoanAmount + Loan_Amount_Term + Credit_History + 
##     Property_Area
## 
##                     Df Deviance    AIC
## - Gender             1   547.72 567.72
## - Loan_Amount_Term   1   547.83 567.83
## - ApplicantIncome    1   548.06 568.06
## - CoapplicantIncome  1   548.28 568.28
## <none>                   547.68 569.68
## - LoanAmount         1   549.71 569.71
## - Education          1   550.25 570.25
## - Married            1   554.09 574.09
## - Property_Area      2   560.50 578.50
## - Credit_History     1   737.00 757.00
## 
## Step:  AIC=567.72
## Loan_Status ~ Married + Education + ApplicantIncome + CoapplicantIncome + 
##     LoanAmount + Loan_Amount_Term + Credit_History + Property_Area
## 
##                     Df Deviance    AIC
## - Loan_Amount_Term   1   547.88 565.88
## - ApplicantIncome    1   548.10 566.10
## - CoapplicantIncome  1   548.29 566.29
## <none>                   547.72 567.72
## - LoanAmount         1   549.72 567.72
## - Education          1   550.26 568.26
## - Married            1   555.37 573.37
## - Property_Area      2   560.54 576.54
## - Credit_History     1   737.13 755.13
## 
## Step:  AIC=565.88
## Loan_Status ~ Married + Education + ApplicantIncome + CoapplicantIncome + 
##     LoanAmount + Credit_History + Property_Area
## 
##                     Df Deviance    AIC
## - ApplicantIncome    1   548.33 564.33
## - CoapplicantIncome  1   548.42 564.42
## <none>                   547.88 565.88
## - LoanAmount         1   550.06 566.06
## - Education          1   550.32 566.32
## - Married            1   555.95 571.95
## - Property_Area      2   560.66 574.66
## - Credit_History     1   737.27 753.27
## 
## Step:  AIC=564.33
## Loan_Status ~ Married + Education + CoapplicantIncome + LoanAmount + 
##     Credit_History + Property_Area
## 
##                     Df Deviance    AIC
## - CoapplicantIncome  1   549.21 563.21
## - LoanAmount         1   550.18 564.18
## <none>                   548.33 564.33
## - Education          1   550.84 564.84
## - Married            1   556.29 570.29
## - Property_Area      2   561.06 573.06
## - Credit_History     1   737.29 751.29
## 
## Step:  AIC=563.21
## Loan_Status ~ Married + Education + LoanAmount + Credit_History + 
##     Property_Area
## 
##                  Df Deviance    AIC
## <none>                549.21 563.21
## - LoanAmount      1   551.32 563.32
## - Education       1   551.53 563.53
## - Married         1   556.81 568.81
## - Property_Area   2   562.08 572.08
## - Credit_History  1   739.34 751.34
## 
## Call:  glm(formula = Loan_Status ~ Married + Education + LoanAmount + 
##     Credit_History + Property_Area, family = binomial, data = one_of_dataset)
## 
## Coefficients:
##            (Intercept)              MarriedYes   EducationNot Graduate  
##               -3.01061                 0.61467                -0.39980  
##             LoanAmount          Credit_History  Property_AreaSemiurban  
##               -0.00172                 4.00501                 0.89510  
##     Property_AreaUrban  
##                0.15809  
## 
## Degrees of Freedom: 613 Total (i.e. Null);  607 Residual
## Null Deviance:       762.9 
## Residual Deviance: 549.2     AIC: 563.2
#run the smallest AIC
aci_model<-glm(formula = Loan_Status ~ Married + Credit_History + Property_Area, 
    family = binomial, data = one_of_dataset)

#ROC
rocplotone <- roc(one_of_dataset$Loan_Status ~ fitted(aci_model), data = one_of_dataset)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot.roc(rocplotone, print.auc = TRUE)

auc(rocplotone)
## Area under the curve: 0.7904

Building Logistic Regression Model2 withdataset that replaced by using the mean of each feature separately

#Purposeful Model Selection  
##step1 Fit “simple” logistic regression models for each of the predictors separately.
#Eliminate any predictor values with large p-values (say >0.2).

summary(glm(Loan_Status ~ Gender, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Gender, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5666  -1.4421   0.8330   0.8330   0.9341  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   0.6035     0.2102   2.871  0.00409 **
## Gender        0.2766     0.2358   1.173  0.24075   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 640.40  on 520  degrees of freedom
## AIC: 644.4
## 
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Married, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Married, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6047  -1.4358   0.8036   0.8036   0.9394  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.5895     0.1535   3.841 0.000123 ***
## Married       0.3751     0.1960   1.914 0.055617 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 638.12  on 520  degrees of freedom
## AIC: 642.12
## 
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Dependents, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Dependents, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5437  -1.5387   0.8510   0.8529   0.8569  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.829448   0.116986   7.090 1.34e-12 ***
## Dependents  -0.005578   0.096089  -0.058    0.954    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 641.75  on 520  degrees of freedom
## AIC: 645.75
## 
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Education, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Education, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6099  -1.3594   0.7997   0.7997   1.0057  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   0.4182     0.1786   2.342  0.01919 * 
## Education     0.5579     0.2116   2.637  0.00836 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 634.93  on 520  degrees of freedom
## AIC: 638.93
## 
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Self_Employed, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Self_Employed, family = binomial, 
##     data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5467  -1.5467   0.8486   0.8486   0.8817  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.83601    0.10119   8.262   <2e-16 ***
## Self_Employed -0.09157    0.29643  -0.309    0.757    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 641.66  on 520  degrees of freedom
## AIC: 645.66
## 
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ ApplicantIncome, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ ApplicantIncome, family = binomial, 
##     data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6084  -1.5253   0.8464   0.8599   0.8857  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     7.284e-01  2.323e-01   3.136  0.00171 **
## ApplicantIncome 2.445e-05  5.357e-05   0.457  0.64802   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 641.55  on 520  degrees of freedom
## AIC: 645.55
## 
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ CoapplicantIncome, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ CoapplicantIncome, family = binomial, 
##     data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7310  -1.4821   0.8210   0.9007   0.9007  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       6.927e-01  1.288e-01   5.379  7.5e-08 ***
## CoapplicantIncome 1.042e-04  7.027e-05   1.483    0.138    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 639.51  on 520  degrees of freedom
## AIC: 643.51
## 
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ LoanAmount, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ LoanAmount, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6031  -1.5224   0.8424   0.8568   0.9001  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.971599   0.308215   3.152  0.00162 **
## LoanAmount  -0.001175   0.002351  -0.500  0.61718   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 641.51  on 520  degrees of freedom
## AIC: 645.51
## 
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Loan_Amount_Term, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Loan_Amount_Term, family = binomial, 
##     data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6623  -1.5352   0.8577   0.8577   0.8956  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)  
## (Intercept)       1.1236351  0.5179354   2.169    0.030 *
## Loan_Amount_Term -0.0008693  0.0014812  -0.587    0.557  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 641.41  on 520  degrees of freedom
## AIC: 645.41
## 
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Credit_History, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Credit_History, family = binomial, 
##     data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8068  -0.3664   0.6596   0.6596   2.3385  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.6672     0.4625  -5.767 8.06e-09 ***
## Credit_History   4.0819     0.4777   8.545  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 476.77  on 520  degrees of freedom
## AIC: 480.77
## 
## Number of Fisher Scoring iterations: 5
summary(glm(Loan_Status ~ Urban, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Urban, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5706  -1.4823   0.8299   0.8299   0.9005  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.8890     0.1165   7.632 2.31e-14 ***
## Urban        -0.1959     0.2021  -0.969    0.332    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 640.82  on 520  degrees of freedom
## AIC: 644.82
## 
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Rural, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Rural, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6137  -1.3916   0.7968   0.7968   0.9774  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.9846     0.1178   8.359   <2e-16 ***
## Rural        -0.4940     0.2019  -2.447   0.0144 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 635.84  on 520  degrees of freedom
## AIC: 639.84
## 
## Number of Fisher Scoring iterations: 4
summary(glm(formula=Loan_Status ~ Married + Education + CoapplicantIncome + Credit_History + Rural, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + CoapplicantIncome + 
##     Credit_History + Rural, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1357  -0.3367   0.5501   0.6936   2.6328  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -3.263e+00  5.423e-01  -6.017 1.78e-09 ***
## Married            4.288e-01  2.469e-01   1.736   0.0825 .  
## Education          4.215e-01  2.613e-01   1.613   0.1067    
## CoapplicantIncome  1.313e-04  9.166e-05   1.432   0.1521    
## Credit_History     4.143e+00  4.838e-01   8.564  < 2e-16 ***
## Rural             -5.927e-01  2.482e-01  -2.388   0.0169 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 461.17  on 516  degrees of freedom
## AIC: 473.17
## 
## Number of Fisher Scoring iterations: 5
#step2 Conduct forward stepwise selection with remaining predictors, usually using a more stringent cut-off, such as p-value<0.1 or perhaps AIC/BIC. 

summary(glm(formula=Loan_Status ~ Married + Education + Credit_History + Rural, family=binomial, data=loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0279  -0.3518   0.5233   0.6631   2.5710  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -3.1672     0.5365  -5.903 3.57e-09 ***
## Married          0.5165     0.2389   2.162   0.0306 *  
## Education        0.4509     0.2603   1.732   0.0832 .  
## Credit_History   4.1191     0.4820   8.546  < 2e-16 ***
## Rural           -0.5514     0.2455  -2.246   0.0247 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 463.30  on 517  degrees of freedom
## AIC: 473.3
## 
## Number of Fisher Scoring iterations: 5
#step3 Consider adding in any variables that were not included in the model after Step 1 or Step 2.  A predictor can be added in even if p-value>0.1 if the AIC/BIC is lower or if it changes the estimated β coefficients by at least, say, 10%.

summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Urban, family=binomial, data = loan_data_R))            # chnage rural by at 50%
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Urban, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1969  -0.2944   0.5423   0.6690   2.5945  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.8709     0.5516  -5.204 1.95e-07 ***
## Married          0.5246     0.2406   2.180  0.02923 *  
## Education        0.4767     0.2632   1.811  0.07011 .  
## Credit_History   4.1890     0.4883   8.579  < 2e-16 ***
## Rural           -0.9362     0.2979  -3.142  0.00168 ** 
## Urban           -0.7706     0.3011  -2.559  0.01050 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 456.59  on 516  degrees of freedom
## AIC: 468.59
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Semiurban, family=binomial, data = loan_data_R))        
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Semiurban, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1969  -0.2944   0.5423   0.6690   2.5945  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -3.6415     0.5797  -6.282 3.35e-10 ***
## Married          0.5246     0.2406   2.180   0.0292 *  
## Education        0.4767     0.2632   1.811   0.0701 .  
## Credit_History   4.1890     0.4883   8.579  < 2e-16 ***
## Rural           -0.1656     0.2812  -0.589   0.5559    
## Semiurban        0.7706     0.3011   2.559   0.0105 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 456.59  on 516  degrees of freedom
## AIC: 468.59
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Loan_Amount_Term, family=binomial, data = loan_data_R))        
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Loan_Amount_Term, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0434  -0.3515   0.5253   0.6644   2.5703  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.0398138  0.8326495  -3.651 0.000261 ***
## Married           0.5123867  0.2398351   2.136 0.032646 *  
## Education         0.4547076  0.2610463   1.742 0.081532 .  
## Credit_History    4.1169297  0.4819874   8.542  < 2e-16 ***
## Rural            -0.5472129  0.2463555  -2.221 0.026335 *  
## Loan_Amount_Term -0.0003705  0.0018553  -0.200 0.841729    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 463.26  on 516  degrees of freedom
## AIC: 475.26
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + LoanAmount, family=binomial, data = loan_data_R))  
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + LoanAmount, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1061  -0.3487   0.5358   0.6623   2.5788  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -2.886694   0.611223  -4.723 2.33e-06 ***
## Married         0.565866   0.245036   2.309   0.0209 *  
## Education       0.489188   0.264156   1.852   0.0640 .  
## Credit_History  4.125726   0.482439   8.552  < 2e-16 ***
## Rural          -0.519677   0.247802  -2.097   0.0360 *  
## LoanAmount     -0.002857   0.003028  -0.943   0.3455    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 462.40  on 516  degrees of freedom
## AIC: 474.4
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + CoapplicantIncome, family=binomial, data = loan_data_R))        #change married by aroudn 18%
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + CoapplicantIncome, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1357  -0.3367   0.5501   0.6936   2.6328  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -3.263e+00  5.423e-01  -6.017 1.78e-09 ***
## Married            4.288e-01  2.469e-01   1.736   0.0825 .  
## Education          4.215e-01  2.613e-01   1.613   0.1067    
## Credit_History     4.143e+00  4.838e-01   8.564  < 2e-16 ***
## Rural             -5.927e-01  2.482e-01  -2.388   0.0169 *  
## CoapplicantIncome  1.313e-04  9.166e-05   1.432   0.1521    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 461.17  on 516  degrees of freedom
## AIC: 473.17
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + ApplicantIncome, family=binomial, data = loan_data_R))      
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + ApplicantIncome, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0528  -0.3545   0.5261   0.6632   2.5762  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -3.106e+00  5.746e-01  -5.406 6.43e-08 ***
## Married          5.164e-01  2.390e-01   2.161   0.0307 *  
## Education        4.619e-01  2.632e-01   1.755   0.0792 .  
## Credit_History   4.125e+00  4.824e-01   8.549  < 2e-16 ***
## Rural           -5.434e-01  2.471e-01  -2.199   0.0279 *  
## ApplicantIncome -1.902e-05  6.506e-05  -0.292   0.7700    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 463.21  on 516  degrees of freedom
## AIC: 475.21
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Self_Employed, family=binomial, data = loan_data_R))     
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Self_Employed, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0430  -0.3561   0.5147   0.6537   2.5629  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -3.1500     0.5367  -5.870 4.37e-09 ***
## Married          0.5199     0.2391   2.174   0.0297 *  
## Education        0.4516     0.2603   1.735   0.0828 .  
## Credit_History   4.1330     0.4828   8.561  < 2e-16 ***
## Rural           -0.5476     0.2457  -2.228   0.0259 *  
## Self_Employed   -0.2762     0.3492  -0.791   0.4290    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 462.69  on 516  degrees of freedom
## AIC: 474.69
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Dependents, family=binomial, data = loan_data_R))        
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Dependents, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0447  -0.3581   0.5288   0.6624   2.5682  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.14722    0.53957  -5.833 5.45e-09 ***
## Married         0.54758    0.25908   2.114   0.0346 *  
## Education       0.44203    0.26163   1.690   0.0911 .  
## Credit_History  4.11596    0.48196   8.540  < 2e-16 ***
## Rural          -0.55483    0.24578  -2.257   0.0240 *  
## Dependents     -0.04106    0.13060  -0.314   0.7532    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 463.20  on 516  degrees of freedom
## AIC: 475.2
## 
## Number of Fisher Scoring iterations: 5
# we adding back semiurban first


summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Urban, family=binomial, data = loan_data_R))        # chnage rural by more than 50%
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Urban, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1969  -0.2944   0.5423   0.6690   2.5945  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.8709     0.5516  -5.204 1.95e-07 ***
## Married          0.5246     0.2406   2.180  0.02923 *  
## Education        0.4767     0.2632   1.811  0.07011 .  
## Credit_History   4.1890     0.4883   8.579  < 2e-16 ***
## Rural           -0.9362     0.2979  -3.142  0.00168 ** 
## Urban           -0.7706     0.3011  -2.559  0.01050 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 456.59  on 516  degrees of freedom
## AIC: 468.59
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Urban + CoapplicantIncome, family=binomial, data = loan_data_R))       #chnage married about 17% and rural about 30%
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Urban + CoapplicantIncome, family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2741  -0.3060   0.5240   0.6792   2.6515  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.959e+00  5.570e-01  -5.313 1.08e-07 ***
## Married            4.394e-01  2.487e-01   1.767  0.07728 .  
## Education          4.486e-01  2.641e-01   1.698  0.08943 .  
## Credit_History     4.204e+00  4.891e-01   8.597  < 2e-16 ***
## Rural             -9.742e-01  3.003e-01  -3.244  0.00118 ** 
## Urban             -7.634e-01  3.015e-01  -2.533  0.01132 *  
## CoapplicantIncome  1.280e-04  9.235e-05   1.386  0.16573    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 454.60  on 515  degrees of freedom
## AIC: 468.6
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Urban + CoapplicantIncome + LoanAmount, family=binomial, data = loan_data_R))       #chnage over 10%
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Urban + CoapplicantIncome + LoanAmount, family = binomial, 
##     data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2628  -0.3130   0.5124   0.6822   2.6779  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.462e+00  6.382e-01  -3.858 0.000114 ***
## Married            5.058e-01  2.533e-01   1.997 0.045843 *  
## Education          4.996e-01  2.676e-01   1.867 0.061942 .  
## Credit_History     4.214e+00  4.892e-01   8.613  < 2e-16 ***
## Rural             -9.516e-01  3.012e-01  -3.159 0.001581 ** 
## Urban             -8.112e-01  3.039e-01  -2.669 0.007608 ** 
## CoapplicantIncome  1.608e-04  9.459e-05   1.699 0.089227 .  
## LoanAmount        -4.948e-03  3.150e-03  -1.571 0.116229    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 452.12  on 514  degrees of freedom
## AIC: 468.12
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Urban + CoapplicantIncome + LoanAmount + Dependents, family=binomial, data = loan_data_R))      
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Urban + CoapplicantIncome + LoanAmount + Dependents, 
##     family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2625  -0.3135   0.5114   0.6830   2.6770  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.459e+00  6.404e-01  -3.840 0.000123 ***
## Married            5.121e-01  2.762e-01   1.854 0.063757 .  
## Education          4.982e-01  2.687e-01   1.854 0.063767 .  
## Credit_History     4.213e+00  4.894e-01   8.608  < 2e-16 ***
## Rural             -9.526e-01  3.017e-01  -3.158 0.001590 ** 
## Urban             -8.115e-01  3.040e-01  -2.669 0.007598 ** 
## CoapplicantIncome  1.600e-04  9.559e-05   1.673 0.094262 .  
## LoanAmount        -4.934e-03  3.160e-03  -1.561 0.118423    
## Dependents        -7.637e-03  1.342e-01  -0.057 0.954607    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 452.11  on 513  degrees of freedom
## AIC: 470.11
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Urban + CoapplicantIncome + LoanAmount + ApplicantIncome, family=binomial, data = loan_data_R))     
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Urban + CoapplicantIncome + LoanAmount + ApplicantIncome, 
##     family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2676  -0.3163   0.5071   0.6855   2.6864  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.554e+00  6.465e-01  -3.951 7.78e-05 ***
## Married            5.092e-01  2.546e-01   2.000  0.04549 *  
## Education          4.621e-01  2.697e-01   1.713  0.08666 .  
## Credit_History     4.196e+00  4.891e-01   8.580  < 2e-16 ***
## Rural             -9.849e-01  3.031e-01  -3.249  0.00116 ** 
## Urban             -8.128e-01  3.043e-01  -2.671  0.00757 ** 
## CoapplicantIncome  2.284e-04  1.129e-04   2.022  0.04316 *  
## LoanAmount        -7.751e-03  3.991e-03  -1.942  0.05209 .  
## ApplicantIncome    1.025e-04  8.779e-05   1.167  0.24302    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 450.71  on 513  degrees of freedom
## AIC: 468.71
## 
## Number of Fisher Scoring iterations: 5
#the following are just roughtly change near 10% but p valur is very large we just ingore

summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Urban + CoapplicantIncome + LoanAmount + Self_Employed, family=binomial, data = loan_data_R))      
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Urban + CoapplicantIncome + LoanAmount + Self_Employed, 
##     family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2759  -0.3131   0.5116   0.6847   2.6676  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.464e+00  6.386e-01  -3.859 0.000114 ***
## Married            5.078e-01  2.535e-01   2.003 0.045169 *  
## Education          4.981e-01  2.677e-01   1.861 0.062781 .  
## Credit_History     4.226e+00  4.901e-01   8.623  < 2e-16 ***
## Rural             -9.544e-01  3.015e-01  -3.165 0.001549 ** 
## Urban             -8.239e-01  3.048e-01  -2.703 0.006870 ** 
## CoapplicantIncome  1.543e-04  9.482e-05   1.627 0.103752    
## LoanAmount        -4.682e-03  3.175e-03  -1.475 0.140219    
## Self_Employed     -2.444e-01  3.601e-01  -0.679 0.497322    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 451.67  on 513  degrees of freedom
## AIC: 469.67
## 
## Number of Fisher Scoring iterations: 5
#4Attempt adding plausible interactions among variables in the model,considering interactions as follows does not produce anything extra

summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Urban + CoapplicantIncome + LoanAmount + LoanAmount:Credit_History, family=binomial, data = loan_data_R))       
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Urban + CoapplicantIncome + LoanAmount + LoanAmount:Credit_History, 
##     family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2495  -0.2977   0.5147   0.6857   2.6780  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)               -4.435e+00  1.887e+00  -2.351  0.01875 * 
## Married                    5.121e-01  2.543e-01   2.014  0.04402 * 
## Education                  5.217e-01  2.681e-01   1.946  0.05164 . 
## Credit_History             6.273e+00  1.908e+00   3.288  0.00101 **
## Rural                     -9.429e-01  3.008e-01  -3.135  0.00172 **
## Urban                     -8.127e-01  3.046e-01  -2.668  0.00763 **
## CoapplicantIncome          1.496e-04  9.406e-05   1.591  0.11165   
## LoanAmount                 9.990e-03  1.300e-02   0.769  0.44218   
## Credit_History:LoanAmount -1.570e-02  1.328e-02  -1.182  0.23732   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 450.73  on 513  degrees of freedom
## AIC: 468.73
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History 
            + Rural + Urban + CoapplicantIncome + LoanAmount + LoanAmount:Education, family=binomial, data = loan_data_R))    
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Urban + CoapplicantIncome + LoanAmount + LoanAmount:Education, 
##     family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2661  -0.3124   0.5107   0.6849   2.6760  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -2.229e+00  9.291e-01  -2.399  0.01644 *  
## Married               5.133e-01  2.541e-01   2.020  0.04338 *  
## Education             1.961e-01  9.171e-01   0.214  0.83067    
## Credit_History        4.204e+00  4.897e-01   8.586  < 2e-16 ***
## Rural                -9.477e-01  3.017e-01  -3.142  0.00168 ** 
## Urban                -7.999e-01  3.056e-01  -2.618  0.00885 ** 
## CoapplicantIncome     1.596e-04  9.474e-05   1.685  0.09201 .  
## LoanAmount           -6.917e-03  6.508e-03  -1.063  0.28786    
## Education:LoanAmount  2.521e-03  7.274e-03   0.346  0.72897    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 452.00  on 513  degrees of freedom
## AIC: 470
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
            + Rural+ Urban + LoanAmount +CoapplicantIncome + Married:Credit_History, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Urban + LoanAmount + CoapplicantIncome + Married:Credit_History, 
##     family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2706  -0.3055   0.5095   0.6820   2.5214  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.034e+00  8.471e-01  -2.401  0.01636 *  
## Married                -1.363e-01  9.628e-01  -0.142  0.88741    
## Education               4.919e-01  2.688e-01   1.830  0.06726 .  
## Credit_History          3.765e+00  7.689e-01   4.897 9.75e-07 ***
## Rural                  -9.492e-01  3.016e-01  -3.147  0.00165 ** 
## Urban                  -8.100e-01  3.038e-01  -2.666  0.00767 ** 
## LoanAmount             -4.963e-03  3.153e-03  -1.574  0.11554    
## CoapplicantIncome       1.645e-04  9.503e-05   1.731  0.08340 .  
## Married:Credit_History  6.798e-01  9.880e-01   0.688  0.49142    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 451.66  on 513  degrees of freedom
## AIC: 469.66
## 
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
            + Rural+ Urban + LoanAmount +CoapplicantIncome + LoanAmount:CoapplicantIncome, family=binomial, data = loan_data_R))
## 
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History + 
##     Rural + Urban + LoanAmount + CoapplicantIncome + LoanAmount:CoapplicantIncome, 
##     family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4184  -0.3083   0.4902   0.6895   2.6676  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -2.839e+00  6.865e-01  -4.135 3.55e-05 ***
## Married                       4.746e-01  2.554e-01   1.858  0.06317 .  
## Education                     5.250e-01  2.699e-01   1.945  0.05173 .  
## Credit_History                4.212e+00  4.899e-01   8.598  < 2e-16 ***
## Rural                        -9.738e-01  3.023e-01  -3.222  0.00127 ** 
## Urban                        -8.372e-01  3.055e-01  -2.740  0.00614 ** 
## LoanAmount                   -1.859e-03  3.764e-03  -0.494  0.62137    
## CoapplicantIncome             6.299e-04  3.346e-04   1.883  0.05977 .  
## LoanAmount:CoapplicantIncome -3.303e-06  2.217e-06  -1.490  0.13623    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 449.92  on 513  degrees of freedom
## AIC: 467.92
## 
## Number of Fisher Scoring iterations: 5
# thus, the finla model included these presictors
# Married, Credit_History, Urban, Rural,Education, CoapplicantIncome, LoanAmount

hand_best_model_dataset <-dplyr::select(loan_data_R, Married, Credit_History, Urban, Rural,Education, CoapplicantIncome, Loan_Status, LoanAmount)

bestModel <- glm(Loan_Status ~ Married + Credit_History + Urban + Rural 
                 + Education + CoapplicantIncome + LoanAmount, family = binomial, data = loan_data_R)
summary(bestModel)
## 
## Call:
## glm(formula = Loan_Status ~ Married + Credit_History + Urban + 
##     Rural + Education + CoapplicantIncome + LoanAmount, family = binomial, 
##     data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2628  -0.3130   0.5124   0.6822   2.6779  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.462e+00  6.382e-01  -3.858 0.000114 ***
## Married            5.058e-01  2.533e-01   1.997 0.045843 *  
## Credit_History     4.214e+00  4.892e-01   8.613  < 2e-16 ***
## Urban             -8.112e-01  3.039e-01  -2.669 0.007608 ** 
## Rural             -9.516e-01  3.012e-01  -3.159 0.001581 ** 
## Education          4.996e-01  2.676e-01   1.867 0.061942 .  
## CoapplicantIncome  1.608e-04  9.459e-05   1.699 0.089227 .  
## LoanAmount        -4.948e-03  3.150e-03  -1.571 0.116229    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 452.12  on 514  degrees of freedom
## AIC: 468.12
## 
## Number of Fisher Scoring iterations: 5
cor(hand_best_model_dataset$Loan_Status, fitted(bestModel)) # R=0.6 is useful for comparing fits of different models for the same data.
## [1] 0.5973614
#ROC
rocBestModel <- roc(hand_best_model_dataset$Loan_Status ~ fitted(bestModel))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(rocBestModel, print.auc = TRUE)

auc(rocBestModel) #0.81
## Area under the curve: 0.8108
#hoslem.test
hoslem.test(loan_data_R$Loan_Status, fitted(bestModel)) #0.06623
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  loan_data_R$Loan_Status, fitted(bestModel)
## X-squared = 14.654, df = 8, p-value = 0.06623
#Confusion Matrix
fit123 = predict(bestModel, newdata = loan_data_R, type = 'response') 
# If results are more than 50% then convert to 1 else 0
fit123 = ifelse(fit123 >=0.5,1,0) #Setting cut-off to be at 0.5
# Evaluate predictions on the training dataset through Confusion Matrix
cnf_matrix = table(predicted = fit123, actual = loan_data_R$Loan_Status)
cnf_matrix
##          actual
## predicted   0   1
##         0  72   5
##         1  87 358
TN = cnf_matrix[1,1] # True Negative - Actual & Predicted is 0/N
TP = cnf_matrix[2,2] # True Positive - Actual & Predicted is 1/Y
FP = cnf_matrix[2,1] # False Positive - Actual is 0/N but Predicted is 1/Y
FN = cnf_matrix[1,2] # False Nefgative - Actual is 1/Y but Predicted is 0/N
TO = TN+TP+FP+FN # Total Observations

accuracy = (TP+TN)/TO # Accuracy or Prevalance of Confusion Matrix = 0.82
accuracy
## [1] 0.8237548
precision = TP/(TP+FP) # Precision = 0.80
precision
## [1] 0.8044944
sensitivity = TP/(TP+FN) # True Positive Rate = 0.98
sensitivity
## [1] 0.9862259
error = (FP+FN)/TO # Error Rate=0.18
error
## [1] 0.1762452
specificity = TN/(TN+FP)
specificity      #0.45
## [1] 0.4528302
G=sqrt(specificity*sensitivity) ##G-mean=0.67
G
## [1] 0.668276
####################################################################################
#Aautomatic backward selection
summary(reg1)
## 
## Call:
## glm(formula = Loan_Status ~ ., family = binomial, data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2938  -0.3139   0.5005   0.6809   2.6244  
## 
## Coefficients: (1 not defined because of singularities)
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.541e+00  9.550e-01  -2.660 0.007806 ** 
## Gender             3.093e-01  3.257e-01   0.950 0.342319    
## Married            4.423e-01  2.920e-01   1.515 0.129762    
## Dependents        -2.877e-02  1.366e-01  -0.211 0.833221    
## Education          4.837e-01  2.740e-01   1.766 0.077460 .  
## Self_Employed     -3.218e-01  3.669e-01  -0.877 0.380433    
## ApplicantIncome    1.019e-04  8.949e-05   1.139 0.254630    
## CoapplicantIncome  2.072e-04  1.152e-04   1.799 0.072011 .  
## LoanAmount        -7.525e-03  4.023e-03  -1.871 0.061383 .  
## Loan_Amount_Term  -4.842e-04  1.937e-03  -0.250 0.802634    
## Credit_History     4.215e+00  4.910e-01   8.585  < 2e-16 ***
## Urban             -8.709e-01  3.097e-01  -2.812 0.004920 ** 
## Rural             -1.012e+00  3.054e-01  -3.315 0.000917 ***
## Semiurban                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 449.00  on 509  degrees of freedom
## AIC: 475
## 
## Number of Fisher Scoring iterations: 5
reg2 <- update(reg1,.~.-Semiurban)
summary(reg2)
## 
## Call:
## glm(formula = Loan_Status ~ Gender + Married + Dependents + Education + 
##     Self_Employed + ApplicantIncome + CoapplicantIncome + LoanAmount + 
##     Loan_Amount_Term + Credit_History + Urban + Rural, family = binomial, 
##     data = loan_data_R)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2938  -0.3139   0.5005   0.6809   2.6244  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.541e+00  9.550e-01  -2.660 0.007806 ** 
## Gender             3.093e-01  3.257e-01   0.950 0.342319    
## Married            4.423e-01  2.920e-01   1.515 0.129762    
## Dependents        -2.877e-02  1.366e-01  -0.211 0.833221    
## Education          4.837e-01  2.740e-01   1.766 0.077460 .  
## Self_Employed     -3.218e-01  3.669e-01  -0.877 0.380433    
## ApplicantIncome    1.019e-04  8.949e-05   1.139 0.254630    
## CoapplicantIncome  2.072e-04  1.152e-04   1.799 0.072011 .  
## LoanAmount        -7.525e-03  4.023e-03  -1.871 0.061383 .  
## Loan_Amount_Term  -4.842e-04  1.937e-03  -0.250 0.802634    
## Credit_History     4.215e+00  4.910e-01   8.585  < 2e-16 ***
## Urban             -8.709e-01  3.097e-01  -2.812 0.004920 ** 
## Rural             -1.012e+00  3.054e-01  -3.315 0.000917 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641.76  on 521  degrees of freedom
## Residual deviance: 449.00  on 509  degrees of freedom
## AIC: 475
## 
## Number of Fisher Scoring iterations: 5
stepAIC(reg2, direction = "backward")
## Start:  AIC=475
## Loan_Status ~ Gender + Married + Dependents + Education + Self_Employed + 
##     ApplicantIncome + CoapplicantIncome + LoanAmount + Loan_Amount_Term + 
##     Credit_History + Urban + Rural
## 
##                     Df Deviance    AIC
## - Dependents         1   449.05 473.05
## - Loan_Amount_Term   1   449.07 473.07
## - Self_Employed      1   449.75 473.75
## - Gender             1   449.89 473.89
## - ApplicantIncome    1   450.33 474.33
## <none>                   449.00 475.00
## - Married            1   451.29 475.29
## - Education          1   452.06 476.06
## - CoapplicantIncome  1   452.44 476.44
## - LoanAmount         1   452.60 476.60
## - Urban              1   457.15 481.15
## - Rural              1   460.41 484.41
## - Credit_History     1   612.00 636.00
## 
## Step:  AIC=473.05
## Loan_Status ~ Gender + Married + Education + Self_Employed + 
##     ApplicantIncome + CoapplicantIncome + LoanAmount + Loan_Amount_Term + 
##     Credit_History + Urban + Rural
## 
##                     Df Deviance    AIC
## - Loan_Amount_Term   1   449.10 471.10
## - Self_Employed      1   449.80 471.80
## - Gender             1   449.91 471.91
## - ApplicantIncome    1   450.36 472.36
## <none>                   449.05 473.05
## - Married            1   451.40 473.40
## - Education          1   452.19 474.19
## - CoapplicantIncome  1   452.61 474.61
## - LoanAmount         1   452.68 474.68
## - Urban              1   457.16 479.16
## - Rural              1   460.41 482.41
## - Credit_History     1   612.38 634.38
## 
## Step:  AIC=471.1
## Loan_Status ~ Gender + Married + Education + Self_Employed + 
##     ApplicantIncome + CoapplicantIncome + LoanAmount + Credit_History + 
##     Urban + Rural
## 
##                     Df Deviance    AIC
## - Self_Employed      1   449.84 469.84
## - Gender             1   449.99 469.99
## - ApplicantIncome    1   450.53 470.53
## <none>                   449.10 471.10
## - Married            1   451.54 471.54
## - Education          1   452.19 472.19
## - CoapplicantIncome  1   452.74 472.74
## - LoanAmount         1   452.92 472.92
## - Urban              1   457.16 477.16
## - Rural              1   460.52 480.52
## - Credit_History     1   612.51 632.51
## 
## Step:  AIC=469.84
## Loan_Status ~ Gender + Married + Education + ApplicantIncome + 
##     CoapplicantIncome + LoanAmount + Credit_History + Urban + 
##     Rural
## 
##                     Df Deviance    AIC
## - Gender             1   450.71 468.71
## - ApplicantIncome    1   451.01 469.01
## <none>                   449.84 469.84
## - Married            1   452.26 470.26
## - Education          1   453.01 471.01
## - CoapplicantIncome  1   453.51 471.51
## - LoanAmount         1   453.72 471.72
## - Urban              1   457.64 475.64
## - Rural              1   461.13 479.13
## - Credit_History     1   612.67 630.67
## 
## Step:  AIC=468.71
## Loan_Status ~ Married + Education + ApplicantIncome + CoapplicantIncome + 
##     LoanAmount + Credit_History + Urban + Rural
## 
##                     Df Deviance    AIC
## - ApplicantIncome    1   452.12 468.12
## <none>                   450.71 468.71
## - Education          1   453.59 469.59
## - LoanAmount         1   454.60 470.60
## - Married            1   454.69 470.69
## - CoapplicantIncome  1   455.10 471.10
## - Urban              1   458.03 474.03
## - Rural              1   461.66 477.66
## - Credit_History     1   613.48 629.48
## 
## Step:  AIC=468.12
## Loan_Status ~ Married + Education + CoapplicantIncome + LoanAmount + 
##     Credit_History + Urban + Rural
## 
##                     Df Deviance    AIC
## <none>                   452.12 468.12
## - LoanAmount         1   454.60 468.60
## - CoapplicantIncome  1   455.12 469.12
## - Education          1   455.52 469.52
## - Married            1   456.08 470.08
## - Urban              1   459.42 473.42
## - Rural              1   462.44 476.44
## - Credit_History     1   616.89 630.89
## 
## Call:  glm(formula = Loan_Status ~ Married + Education + CoapplicantIncome + 
##     LoanAmount + Credit_History + Urban + Rural, family = binomial, 
##     data = loan_data_R)
## 
## Coefficients:
##       (Intercept)            Married          Education  
##        -2.4622823          0.5058005          0.4995745  
## CoapplicantIncome         LoanAmount     Credit_History  
##         0.0001608         -0.0049479          4.2136553  
##             Urban              Rural  
##        -0.8111698         -0.9516449  
## 
## Degrees of Freedom: 521 Total (i.e. Null);  514 Residual
## Null Deviance:       641.8 
## Residual Deviance: 452.1     AIC: 468.1

Fit the lasso regression model

x<-cbind(loan_data_R$Married, loan_data_R$Education, loan_data_R$CoapplicantIncome,
    loan_data_R$LoanAmount, loan_data_R$Credit_History, loan_data_R$Urban, loan_data_R$Rural,
    loan_data_R$Gender, loan_data_R$Dependents, loan_data_R$Self_Employed, loan_data_R$ApplicantIncome, loan_data_R$Loan_Amount_Term +loan_data_R$Semiurban)  # Combine variables by column

y<-loan_data_R$Loan_Status

grid<-10^seq(10,-2, length=100)        # Create a grid of lambda values
lasso.mod=cv.glmnet(x,y,lambda=grid,   # Build a CV lasso regression          
nfold=length(y),                       # nfold=sample size, leave-one-out CV
alpha=1)                               # alpha=0, lasso reg is fit
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations
## per fold
## Ignore the received warning which recommends leaving 3-or-more out in CV ## 
#Warning message:
#Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold 

plot(log10(lasso.mod$lambda), lasso.mod$cvm,      # Plot average CV error versus log(lambda)
xlab="log10(Lambda)", ylab="CV Error")                     
abline(v = log10(lasso.mod$lambda.min), lty = 3)

(lambda=lasso.mod$lambda.min)        # The lambda that minimizes CV error
## [1] 0.01
predict(lasso.mod,s=lambda,          # Obtain lasso reg coefs
type="coefficients")
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  0.0784336732
## V1           0.0435157043
## V2           0.0502408734
## V3           0.0000123858
## V4          -0.0002629468
## V5           0.7011271760
## V6          -0.0615330894
## V7          -0.0940765495
## V8           0.0229686712
## V9           .           
## V10         -0.0074614315
## V11          .           
## V12          .
#Confusion Matrix
fit1234 = predict(lasso.mod, newx = x, type = 'response') 
# If results are more than 50% then convert to 1 else 0
fit1234 = ifelse(fit1234 >=0.5,1,0) #Setting cut-off to be at 0.5
# Evaluate predictions on the training dataset through Confusion Matrix
cnf_matrix2 = table(predicted = fit1234, actual = loan_data_R$Loan_Status)
cnf_matrix2
##          actual
## predicted   0   1
##         0  72   5
##         1  87 358
TN2 = cnf_matrix2[1,1] # True Negative - Actual & Predicted is 0/N
TP2 = cnf_matrix2[2,2] # True Positive - Actual & Predicted is 1/Y
FP2 = cnf_matrix2[2,1] # False Positive - Actual is 0/N but Predicted is 1/Y
FN2 = cnf_matrix2[1,2] # False Nefgative - Actual is 1/Y but Predicted is 0/N
TO2 = TN2+TP2+FP2+FN2 # Total Observations

accuracy2 = (TP2+TN2)/TO2 # Accuracy or Prevalance of Confusion Matrix
accuracy2
## [1] 0.8237548
precision2 = TP2/(TP2+FP2) 
precision2
## [1] 0.8044944
sensitivity2 = TP2/(TP2+FN2) 
sensitivity2
## [1] 0.9862259
error2 = (FP2+FN2)/TO2 
error2
## [1] 0.1762452
specificity2 = TN2/(TN2+FP2)
specificity2    
## [1] 0.4528302
#G-mean
G2=sqrt(specificity2*sensitivity2) 
G2
## [1] 0.668276